setwd("/Users/vitorpeixoto/Documents")
bank_marketing_data_full <- read.csv("bank-additional-full.csv", sep=";",header = TRUE)
summary(bank_marketing_data_full)
## age job marital
## Min. :17.00 admin. :10422 divorced: 4612
## 1st Qu.:32.00 blue-collar: 9254 married :24928
## Median :38.00 technician : 6743 single :11568
## Mean :40.02 services : 3969 unknown : 80
## 3rd Qu.:47.00 management : 2924
## Max. :98.00 retired : 1720
## (Other) : 6156
## education default housing
## university.degree :12168 no :32588 no :18622
## high.school : 9515 unknown: 8597 unknown: 990
## basic.9y : 6045 yes : 3 yes :21576
## professional.course: 5243
## basic.4y : 4176
## basic.6y : 2292
## (Other) : 1749
## loan contact month day_of_week
## no :33950 cellular :26144 may :13769 fri:7827
## unknown: 990 telephone:15044 jul : 7174 mon:8514
## yes : 6248 aug : 6178 thu:8623
## jun : 5318 tue:8090
## nov : 4101 wed:8134
## apr : 2632
## (Other): 2016
## duration campaign pdays previous
## Min. : 0.0 Min. : 1.000 Min. : 0.0 Min. :0.000
## 1st Qu.: 102.0 1st Qu.: 1.000 1st Qu.:999.0 1st Qu.:0.000
## Median : 180.0 Median : 2.000 Median :999.0 Median :0.000
## Mean : 258.3 Mean : 2.568 Mean :962.5 Mean :0.173
## 3rd Qu.: 319.0 3rd Qu.: 3.000 3rd Qu.:999.0 3rd Qu.:0.000
## Max. :4918.0 Max. :56.000 Max. :999.0 Max. :7.000
##
## poutcome emp.var.rate cons.price.idx cons.conf.idx
## failure : 4252 Min. :-3.40000 Min. :92.20 Min. :-50.8
## nonexistent:35563 1st Qu.:-1.80000 1st Qu.:93.08 1st Qu.:-42.7
## success : 1373 Median : 1.10000 Median :93.75 Median :-41.8
## Mean : 0.08189 Mean :93.58 Mean :-40.5
## 3rd Qu.: 1.40000 3rd Qu.:93.99 3rd Qu.:-36.4
## Max. : 1.40000 Max. :94.77 Max. :-26.9
##
## euribor3m nr.employed y
## Min. :0.634 Min. :4964 no :36548
## 1st Qu.:1.344 1st Qu.:5099 yes: 4640
## Median :4.857 Median :5191
## Mean :3.621 Mean :5167
## 3rd Qu.:4.961 3rd Qu.:5228
## Max. :5.045 Max. :5228
##
First of all, let’s take a look at our dataset.
dim(bank_marketing_data_full)
## [1] 41188 21
names(bank_marketing_data_full)
## [1] "age" "job" "marital" "education"
## [5] "default" "housing" "loan" "contact"
## [9] "month" "day_of_week" "duration" "campaign"
## [13] "pdays" "previous" "poutcome" "emp.var.rate"
## [17] "cons.price.idx" "cons.conf.idx" "euribor3m" "nr.employed"
## [21] "y"
head(bank_marketing_data_full)
## age job marital education default housing loan contact month
## 1 56 housemaid married basic.4y no no no telephone may
## 2 57 services married high.school unknown no no telephone may
## 3 37 services married high.school no yes no telephone may
## 4 40 admin. married basic.6y no no no telephone may
## 5 56 services married high.school no no yes telephone may
## 6 45 services married basic.9y unknown no no telephone may
## day_of_week duration campaign pdays previous poutcome emp.var.rate
## 1 mon 261 1 999 0 nonexistent 1.1
## 2 mon 149 1 999 0 nonexistent 1.1
## 3 mon 226 1 999 0 nonexistent 1.1
## 4 mon 151 1 999 0 nonexistent 1.1
## 5 mon 307 1 999 0 nonexistent 1.1
## 6 mon 198 1 999 0 nonexistent 1.1
## cons.price.idx cons.conf.idx euribor3m nr.employed y
## 1 93.994 -36.4 4.857 5191 no
## 2 93.994 -36.4 4.857 5191 no
## 3 93.994 -36.4 4.857 5191 no
## 4 93.994 -36.4 4.857 5191 no
## 5 93.994 -36.4 4.857 5191 no
## 6 93.994 -36.4 4.857 5191 no
Then, let’s know better our variables by category.
split(names(bank_marketing_data_full),sapply(bank_marketing_data_full, function(x) paste(class(x), collapse=" ")))
## $factor
## [1] "job" "marital" "education" "default" "housing"
## [6] "loan" "contact" "month" "day_of_week" "poutcome"
## [11] "y"
##
## $integer
## [1] "age" "duration" "campaign" "pdays" "previous"
##
## $numeric
## [1] "emp.var.rate" "cons.price.idx" "cons.conf.idx" "euribor3m"
## [5] "nr.employed"
Let’s now take a look at some of the individual variables. We look at the difference between mean, median and possible outliers. Some outliers might need to be fixed.
boxplot(bank_marketing_data_full$age, main="Age",
yaxt="n", xlab="age", horizontal=TRUE,
col=16)
barplot(table(bank_marketing_data_full$job), main="Job",
col=16, las=2)
barplot(table(bank_marketing_data_full$marital), main="Marital",
col=16, las=2)
barplot(table(bank_marketing_data_full$education), main="Education",
col=16, las=2)
barplot(table(bank_marketing_data_full$default), main="Default",
col=16, las=2)
barplot(table(bank_marketing_data_full$housing), main="Housing",
col=16, las=2)
barplot(table(bank_marketing_data_full$loan), main="Loan",
col=16, las=2)
barplot(table(bank_marketing_data_full$contact), main="Contact",
col=16, las=2)
barplot(table(bank_marketing_data_full$month), main="Month",
col=16, las=2)
barplot(table(bank_marketing_data_full$day_of_week), main="Day of Week",
col=16, las=2)
boxplot(bank_marketing_data_full$duration, main="Duration",
yaxt="n", xlab="duration", horizontal=TRUE,
col=16)
boxplot(bank_marketing_data_full$campaign, main="Campaign",
yaxt="n", xlab="campaign", horizontal=TRUE,
col=16)
Duration and Campaign have some outliers, but analysing the dataset, we realised that all of them are important to the model and so, we shall keep them.
boxplot(bank_marketing_data_full$pdays, main="Days since last call (pdays)",
yaxt="n", xlab="pdays", horizontal=TRUE,
col=16)
Pdays has many outliers. This is due to the fact that if this is the first time calling the client, this variable is set as 999 so this creates a lot of 999 instances. This issue is gonna be solved later.
boxplot(bank_marketing_data_full$previous, main="Previous",
yaxt="n", xlab="previous", horizontal=TRUE,
col=16)
barplot(table(bank_marketing_data_full$poutcome), main="Previous outcome (poutcome)",
col=16, las=2)
boxplot(bank_marketing_data_full$emp.var.rate, main="Employment variation rate (emp.var.rate)",
yaxt="n", xlab="emp.var.rate", horizontal=TRUE,
col=16)
boxplot(bank_marketing_data_full$cons.price.idx, main="Consumer price index (cons.price.idx)",
yaxt="n", xlab="cons.price.idx", horizontal=TRUE,
col=16)
boxplot(bank_marketing_data_full$cons.conf.idx, main="Consumer confidence index (cons.conf.idx)",
yaxt="n", xlab="cons.conf.idx", horizontal=TRUE,
col=16)
boxplot(bank_marketing_data_full$euribor3m, main="Euribor tax at 3 months (euribor3m)",
yaxt="n", xlab="euribor3m", horizontal=TRUE,
col=16)
boxplot(bank_marketing_data_full$nr.employed, main="Number of people employed (nr.employed)",
yaxt="n", xlab="nr.employed", horizontal=TRUE,
col=16)
barplot(table(bank_marketing_data_full$y), main="Result variable (y)",
col=16, las=2)
As we talked earlier, the value ‘999’ in variable pdays means the client has not been previously contacted. That is a obvious outlier and should not be treated as numeric. Instead we must change all numbers to categories. Use ‘as.factor’ to do that. Also remove classes 25, 26 and 27 who have just one row of data and that can cause trouble when splitting into training and testing data.
bank_marketing_data_full$pdays <- factor(bank_marketing_data_full$pdays)
bank_marketing_data_full<-bank_marketing_data_full[!(bank_marketing_data_full$pdays==20 |
bank_marketing_data_full$pdays==25 |
bank_marketing_data_full$pdays==26 |
bank_marketing_data_full$pdays==27),]
barplot(table(bank_marketing_data_full$pdays), main="Days since last call (pdays)", col=16, las=2)
Check missing values for all columns, assuming that ‘unknowns’ (not equal to NAs) are treated as missing values by changing those values to NA.
bank_marketing_data_full[bank_marketing_data_full=="unknown"] <- NA
Below we can see the plot with the missing data in red.
sapply(bank_marketing_data_full, function(x) sum(is.na(x)))
## age job marital education default
## 0 330 80 1730 8597
## housing loan contact month day_of_week
## 990 990 0 0 0
## duration campaign pdays previous poutcome
## 0 0 0 0 0
## emp.var.rate cons.price.idx cons.conf.idx euribor3m nr.employed
## 0 0 0 0 0
## y
## 0
aggr_plot <- aggr(bank_marketing_data_full, col=c('blue','red'), numbers=TRUE, sortVars=TRUE, labels=names(bank_marketing_data_full), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough horizontal space to display
## frequencies
##
## Variables sorted by number of missings:
## Variable Count
## default 0.208746115
## education 0.042006605
## housing 0.024038462
## loan 0.024038462
## job 0.008012821
## marital 0.001942502
## age 0.000000000
## contact 0.000000000
## month 0.000000000
## day_of_week 0.000000000
## duration 0.000000000
## campaign 0.000000000
## pdays 0.000000000
## previous 0.000000000
## poutcome 0.000000000
## emp.var.rate 0.000000000
## cons.price.idx 0.000000000
## cons.conf.idx 0.000000000
## euribor3m 0.000000000
## nr.employed 0.000000000
## y 0.000000000
Since the dataset is so dense, we can afford to lose rows with missing data. The next step is to do so and generate again the plot of missing data, showing that no missing data exists anymore.
bank_marketing_data_full <- na.omit(bank_marketing_data_full)
aggr_plot <- aggr(bank_marketing_data_full, col=c('blue','red'), numbers=TRUE, sortVars=TRUE, labels=names(bank_marketing_data_full), cex.axis=.7, gap=3, ylab=c("Histogram of missing data","Pattern"))
##
## Variables sorted by number of missings:
## Variable Count
## age 0
## job 0
## marital 0
## education 0
## default 0
## housing 0
## loan 0
## contact 0
## month 0
## day_of_week 0
## duration 0
## campaign 0
## pdays 0
## previous 0
## poutcome 0
## emp.var.rate 0
## cons.price.idx 0
## cons.conf.idx 0
## euribor3m 0
## nr.employed 0
## y 0
One of the main problems of imbalanced data in the result variable is that it generates a model overfitted to the major class. To correct this overfitting we need to balance the samples. This can be done using oversampling by creating new instances of the minor class. Below we show the ratio of yes and no in the result variable before oversampling.
counts <- table(bank_marketing_data_full$y)
barplot(counts,col=c("blue","red"),legend = rownames(counts), main = "Term Deposit")
Now we show the ratio of yes and no in the result variable after oversampling.
bank_marketing_data_full <- ovun.sample(y ~ ., data = bank_marketing_data_full, method = "over",N = 53000)$data
Now let’s observe the barplot of the result variable. It’s much more balanced and thus less prone to overfitting.
counts <- table(bank_marketing_data_full$y)
barplot(counts,col=c("blue","red"),legend = rownames(counts), main = "Term Deposit")
Creating train and test datasets based on splitting the data in a 80/20 ratio.
set.seed(123)
sample = sample.split(bank_marketing_data_full,SplitRatio = 0.80)
train_data = subset(bank_marketing_data_full, sample==TRUE)
test_data = subset(bank_marketing_data_full, sample==FALSE)
Logistic Regression model
model<-glm(y~.,data = train_data,family = binomial)
summary(model)
##
## Call:
## glm(formula = y ~ ., family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.7089 -0.3859 -0.1297 0.5053 3.0889
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.744e+02 2.773e+01 -9.895 < 2e-16 ***
## age -3.132e-03 1.900e-03 -1.648 0.09927 .
## jobblue-collar -2.879e-01 6.136e-02 -4.693 2.70e-06 ***
## jobentrepreneur -1.419e-01 9.272e-02 -1.531 0.12588
## jobhousemaid 5.029e-03 1.168e-01 0.043 0.96564
## jobmanagement -1.081e-01 6.383e-02 -1.694 0.09032 .
## jobretired 5.098e-01 8.841e-02 5.767 8.07e-09 ***
## jobself-employed -1.138e-01 8.612e-02 -1.322 0.18618
## jobservices -1.620e-01 6.472e-02 -2.504 0.01229 *
## jobstudent 2.712e-01 9.336e-02 2.905 0.00367 **
## jobtechnician 2.610e-02 5.305e-02 0.492 0.62279
## jobunemployed 2.566e-01 9.825e-02 2.611 0.00902 **
## maritalmarried -2.687e-02 5.157e-02 -0.521 0.60228
## maritalsingle 8.353e-02 5.836e-02 1.431 0.15232
## educationbasic.6y 3.331e-03 1.010e-01 0.033 0.97369
## educationbasic.9y 4.321e-02 7.769e-02 0.556 0.57804
## educationhigh.school 8.668e-02 7.505e-02 1.155 0.24809
## educationilliterate 1.500e+00 6.590e-01 2.277 0.02281 *
## educationprofessional.course 1.573e-01 8.186e-02 1.921 0.05469 .
## educationuniversity.degree 3.122e-01 7.618e-02 4.099 4.16e-05 ***
## defaultyes -9.427e+00 2.293e+02 -0.041 0.96720
## housingyes 2.704e-02 3.134e-02 0.863 0.38827
## loanyes -6.539e-02 4.264e-02 -1.533 0.12519
## contacttelephone -5.215e-01 5.629e-02 -9.264 < 2e-16 ***
## monthaug 1.014e+00 9.756e-02 10.395 < 2e-16 ***
## monthdec 2.314e-02 1.804e-01 0.128 0.89793
## monthjul -1.555e-01 7.346e-02 -2.116 0.03434 *
## monthjun -1.032e+00 9.185e-02 -11.231 < 2e-16 ***
## monthmar 2.125e+00 1.145e-01 18.558 < 2e-16 ***
## monthmay -8.520e-01 6.095e-02 -13.980 < 2e-16 ***
## monthnov -7.714e-01 9.116e-02 -8.462 < 2e-16 ***
## monthoct 2.518e-01 1.177e-01 2.139 0.03246 *
## monthsep 3.465e-01 1.376e-01 2.517 0.01183 *
## day_of_weekmon 3.083e-02 5.042e-02 0.611 0.54088
## day_of_weekthu 4.799e-02 4.989e-02 0.962 0.33606
## day_of_weektue 7.524e-02 5.119e-02 1.470 0.14164
## day_of_weekwed 2.288e-01 5.046e-02 4.534 5.78e-06 ***
## duration 6.825e-03 8.060e-05 84.674 < 2e-16 ***
## campaign -3.681e-02 8.698e-03 -4.232 2.32e-05 ***
## pdays1 -4.336e-01 7.491e-01 -0.579 0.56274
## pdays2 8.692e-01 6.552e-01 1.327 0.18460
## pdays3 9.633e-01 6.101e-01 1.579 0.11434
## pdays4 -6.259e-02 6.257e-01 -0.100 0.92032
## pdays5 1.538e-01 6.836e-01 0.225 0.82197
## pdays6 6.357e-01 6.110e-01 1.040 0.29820
## pdays7 6.669e-01 6.704e-01 0.995 0.31989
## pdays8 -3.867e-01 8.162e-01 -0.474 0.63566
## pdays9 -8.869e-02 6.586e-01 -0.135 0.89288
## pdays10 5.177e-01 6.808e-01 0.760 0.44699
## pdays11 3.112e-01 7.704e-01 0.404 0.68620
## pdays12 -1.070e-01 6.665e-01 -0.160 0.87251
## pdays13 1.057e+00 7.933e-01 1.332 0.18276
## pdays14 -3.182e-01 7.688e-01 -0.414 0.67897
## pdays15 1.428e+00 8.637e-01 1.653 0.09824 .
## pdays16 -6.934e-01 1.011e+00 -0.686 0.49279
## pdays17 -2.695e+00 1.041e+00 -2.590 0.00961 **
## pdays18 -6.116e-01 1.058e+00 -0.578 0.56316
## pdays19 -2.023e+00 1.710e+00 -1.184 0.23655
## pdays21 1.045e+01 9.014e+01 0.116 0.90767
## pdays22 5.612e-01 1.202e+00 0.467 0.64051
## pdays999 -5.660e-01 6.430e-01 -0.880 0.37875
## previous -7.591e-02 5.510e-02 -1.378 0.16832
## poutcomenonexistent 4.753e-01 7.901e-02 6.016 1.79e-09 ***
## poutcomesuccess 7.268e-01 2.232e-01 3.257 0.00113 **
## emp.var.rate -2.313e+00 1.023e-01 -22.610 < 2e-16 ***
## cons.price.idx 2.577e+00 1.830e-01 14.081 < 2e-16 ***
## cons.conf.idx 6.736e-03 6.318e-03 1.066 0.28634
## euribor3m 5.823e-01 1.021e-01 5.706 1.16e-08 ***
## nr.employed 5.590e-03 2.289e-03 2.442 0.01461 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 55979 on 40380 degrees of freedom
## Residual deviance: 27160 on 40312 degrees of freedom
## AIC: 27298
##
## Number of Fisher Scoring iterations: 11
Checking variable importance for GLM. We could use the R function cor or just plot it. But since we have too many categorical variables, let’s just analyse the p-values obtained in the model and plot them. Since we want p-values below 0.05, let’s invert them by subtracting to 1. This way we will get the highest scores to the most important variables and will only take those whose p-value is above 0.95.
pvalues <- 1-summary(model)$coefficients[,4]
pvalues <- pvalues[-1]
yyy <- as.list(rep(0.95,length(pvalues)))
bp=barplot(pvalues, main="Variable importance according to p-values",col=sample(colours(), 200), las=2, cex.names=0.6, cex.axis = 0.7, mgp = c(-1, -0, -1))
lines(x=bp,y=yyy,col="blue")
We can compare both results and see if the variable importance according to the p-values checks with the variable correlation between the predictive variables and y.
pairs.panels(bank_marketing_data_full[,c(1:5,21)])
pairs.panels(bank_marketing_data_full[,c(6:10,21)])
pairs.panels(bank_marketing_data_full[,c(11:15,21)])
pairs.panels(bank_marketing_data_full[,c(16:20,21)])
test_result <- predict(model,test_data,type = "response")
test_result <- ifelse(test_result > 0.5,1,0)
test_result<-round(test_result,0)
test_result<-as.factor(test_result)
levels(test_result)<-c("no","yes")
actual1<-test_data[,21]
levels(actual1)<-c("no","yes")
conf1<-confusionMatrix(actual1,test_result,positive = "yes")
conf1
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5358 982
## yes 795 5484
##
## Accuracy : 0.8592
## 95% CI : (0.853, 0.8652)
## No Information Rate : 0.5124
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7184
## Mcnemar's Test P-Value : 1.023e-05
##
## Sensitivity : 0.8481
## Specificity : 0.8708
## Pos Pred Value : 0.8734
## Neg Pred Value : 0.8451
## Prevalence : 0.5124
## Detection Rate : 0.4346
## Detection Prevalence : 0.4976
## Balanced Accuracy : 0.8595
##
## 'Positive' Class : yes
##
roc <- roc.curve(test_data$y, test_result, plotit = F)
pr <- prediction(as.numeric(test_result), as.numeric(test_data$y))
prf <- performance(pr, measure = "tpr", x.measure = "fpr")
plot(prf)
#Area under ROC curve
auc <- performance(pr, measure = "auc")
auc <- auc@y.values[[1]]
auc
## [1] 0.8592489
Select only most significative variables and plot them.
bank_marketing_data_sig <- bank_marketing_data_full[,c("job","education","contact","month","day_of_week","duration","campaign","poutcome","emp.var.rate","cons.price.idx","euribor3m","nr.employed","y")]
pairs.panels(bank_marketing_data_sig[,c(1:6,13)])
pairs.panels(bank_marketing_data_sig[,c(7:12,13)])
Logistic regression for the most significative variables
model_sig<-glm(y~job+education+contact+month+day_of_week+duration+campaign+poutcome+emp.var.rate+cons.price.idx+euribor3m+nr.employed, data = train_data,family = binomial)
summary(model_sig)
##
## Call:
## glm(formula = y ~ job + education + contact + month + day_of_week +
## duration + campaign + poutcome + emp.var.rate + cons.price.idx +
## euribor3m + nr.employed, family = binomial, data = train_data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -7.6972 -0.3878 -0.1303 0.5069 3.0727
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.578e+02 2.137e+01 -12.066 < 2e-16 ***
## jobblue-collar -2.779e-01 6.102e-02 -4.553 5.28e-06 ***
## jobentrepreneur -1.853e-01 9.219e-02 -2.010 0.0444 *
## jobhousemaid -2.012e-02 1.160e-01 -0.173 0.8623
## jobmanagement -1.558e-01 6.299e-02 -2.474 0.0134 *
## jobretired 4.132e-01 7.675e-02 5.383 7.31e-08 ***
## jobself-employed -1.191e-01 8.569e-02 -1.390 0.1644
## jobservices -1.585e-01 6.432e-02 -2.464 0.0137 *
## jobstudent 3.722e-01 8.904e-02 4.180 2.92e-05 ***
## jobtechnician 3.496e-02 5.275e-02 0.663 0.5075
## jobunemployed 2.700e-01 9.807e-02 2.753 0.0059 **
## educationbasic.6y 2.153e-02 9.997e-02 0.215 0.8295
## educationbasic.9y 6.139e-02 7.677e-02 0.800 0.4239
## educationhigh.school 1.239e-01 7.388e-02 1.678 0.0934 .
## educationilliterate 1.542e+00 6.506e-01 2.371 0.0178 *
## educationprofessional.course 1.862e-01 8.102e-02 2.298 0.0216 *
## educationuniversity.degree 3.689e-01 7.457e-02 4.948 7.51e-07 ***
## contacttelephone -5.083e-01 5.269e-02 -9.647 < 2e-16 ***
## monthaug 1.024e+00 9.559e-02 10.714 < 2e-16 ***
## monthdec -4.433e-02 1.788e-01 -0.248 0.8042
## monthjul -1.207e-01 7.195e-02 -1.678 0.0934 .
## monthjun -9.856e-01 8.535e-02 -11.547 < 2e-16 ***
## monthmar 2.107e+00 1.070e-01 19.697 < 2e-16 ***
## monthmay -8.412e-01 6.002e-02 -14.015 < 2e-16 ***
## monthnov -7.747e-01 8.762e-02 -8.842 < 2e-16 ***
## monthoct 2.308e-01 1.151e-01 2.005 0.0450 *
## monthsep 3.271e-01 1.331e-01 2.458 0.0140 *
## day_of_weekmon 2.801e-02 5.022e-02 0.558 0.5770
## day_of_weekthu 4.941e-02 4.963e-02 0.996 0.3195
## day_of_weektue 7.733e-02 5.089e-02 1.520 0.1286
## day_of_weekwed 2.287e-01 5.022e-02 4.554 5.27e-06 ***
## duration 6.818e-03 8.037e-05 84.831 < 2e-16 ***
## campaign -3.673e-02 8.697e-03 -4.223 2.41e-05 ***
## poutcomenonexistent 5.156e-01 4.641e-02 11.110 < 2e-16 ***
## poutcomesuccess 1.796e+00 7.459e-02 24.086 < 2e-16 ***
## emp.var.rate -2.312e+00 1.000e-01 -23.112 < 2e-16 ***
## cons.price.idx 2.479e+00 1.510e-01 16.419 < 2e-16 ***
## euribor3m 6.667e-01 7.061e-02 9.442 < 2e-16 ***
## nr.employed 3.903e-03 1.581e-03 2.468 0.0136 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 55979 on 40380 degrees of freedom
## Residual deviance: 27276 on 40342 degrees of freedom
## AIC: 27354
##
## Number of Fisher Scoring iterations: 6
test_result_sig <- predict(model_sig,test_data,type = "response")
test_result_sig <- ifelse(test_result_sig > 0.5,1,0)
test_result_sig <- round(test_result_sig,0)
test_result_sig <- as.factor(test_result_sig)
levels(test_result_sig) <- c("no","yes")
actual2 <- test_data[,21]
levels(actual2) <- c("no","yes")
conf2 <- confusionMatrix(actual2,test_result_sig,positive = "yes")
conf2
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 5359 981
## yes 811 5468
##
## Accuracy : 0.858
## 95% CI : (0.8518, 0.864)
## No Information Rate : 0.5111
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.716
## Mcnemar's Test P-Value : 6.545e-05
##
## Sensitivity : 0.8479
## Specificity : 0.8686
## Pos Pred Value : 0.8708
## Neg Pred Value : 0.8453
## Prevalence : 0.5111
## Detection Rate : 0.4333
## Detection Prevalence : 0.4976
## Balanced Accuracy : 0.8582
##
## 'Positive' Class : yes
##
roc_sig <- roc.curve(test_data$y, test_result_sig, plotit = F)
pr_sig <- prediction(as.numeric(test_result_sig), as.numeric(test_data$y))
prf_sig <- performance(pr_sig, measure = "tpr", x.measure = "fpr")
plot(prf_sig)
#Area under ROC curve
auc_sig <- performance(pr_sig, measure = "auc")
auc_sig <- auc_sig@y.values[[1]]
auc_sig
## [1] 0.8580537